/*******************************************************************************
																				
	DESCRIPTION: 	This do-file creates the main dataset used for the analysis of
					long-term unemployment.
	
*******************************************************************************/

clear all
global id_code 001_9

********************************************************************************
* A1: Merge PES data with LISA data
********************************************************************************
* Master file: unemployment spells. Using file: cleaned LISA data, which the m:1
* merge requires to be unique on LopNr_PersonNr and year.
* The data global is not set in this file and has to be defined by the caller.
use "${data}/001_1_UnemploymentSpells.dta", clear

* keep(match) retains only rows present in both files and nogenerate suppresses
* the _merge variable, so nothing has to be filtered or dropped afterwards.
* 11,997,029 observations matched
merge m:1 LopNr_PersonNr year using "${data}/001_2_LISA_clean.dta", ///
	keep(match) nogenerate

sort LopNr_PersonNr year InLnr
compress

save "${data}/${id_code}_LISA_PES_cleaned.dta", replace


********************************************************************************
* A2: Checking the number of missing values
********************************************************************************
* Builds a table with one row per year holding, for every variable in the range
* any_LT_spells_5Y - L_Occupation_3D_L1L2, the share of observations whose value
* is missing, rounded to two decimals (variables r_<name>).
* NOTE: the output file name says "non_missing" although the r_ variables hold
* the share of MISSING values; the name is kept so existing references still work.

use "${data}/${id_code}_LISA_PES_cleaned.dta", clear

* Expand the variable range once so that every step works on the same list
unab checkvars : any_LT_spells_5Y - L_Occupation_3D_L1L2

* m_<name> = 1 if the value is missing. missing() also catches the extended
* missing values .a-.z; a comparison with "." alone would skip them although
* the (count) statistic below excludes them, which would distort the share.
local flagvars
foreach var of local checkvars {
	gen byte m_`var' = missing(`var')
	local flagvars `flagvars' m_`var'
}

* Per year: (count) = number of non-missing values, (sum) of the flag = number missing
collapse (count) `checkvars' (sum) `flagvars', by(year)

* Share missing = missing / (non-missing + missing). After the collapse there is
* exactly one row per year, so no by-prefix is needed.
foreach var of local checkvars {
	gen r_`var' = m_`var' / (`var' + m_`var')
	replace r_`var' = round(r_`var', 0.01)
	drop m_`var' `var'
}

save "${data}/${id_code}_Share_non_missing_allYears.dta", replace

********************************************************************************
* A3: Generating a variable indicating years between migration and unemployment
********************************************************************************

use "${data}/${id_code}_LISA_PES_cleaned.dta", clear

* Years elapsed between yearMigrated and the observation year. Rows with
* yearMigrated == 0 get the placeholder 9999 instead of a difference
* (presumably 0 codes "no migration recorded" - TODO confirm).
gen ySinceMigrat = cond(yearMigrated == 0, 9999, year - yearMigrated)
label variable ySinceMigrat "years since migration"
drop yearMigrated

********************************************************************************
* A4: Creating a dummy for missing values and replacing missing values
********************************************************************************
* Works on the data left in memory by A3. Every loop applies the same recipe:
*   1. store a 0/1 dummy marking the rows whose value is missing,
*   2. compute a within-year statistic (median, mode or mean) of the observed values,
*   3. write that statistic into the rows that were missing.
* missing() is used instead of a comparison with "." so that extended missing
* values (.a-.z) are flagged and imputed as well. The egen functions median(),
* mode() and mean() skip missing values and fill every row of the by-group, so
* a single egen call per variable is enough.
* NOTE: the dummy suffix is _Missing in the first two loops and Missing (no
* underscore) in the last two; the names are kept because later code refers to them.

* Replace missing categorical ordinal variables with the median and a dummy to indicate that it was previously missing
foreach var of varlist EducLevel L_N_Kids L_N_Kids_U18 L_Age_Youngest nEmployers1Y-nEmployers5Y tenure {
	gen byte `var'_Missing = missing(`var')
	bysort year: egen `var'temp = median(`var')
	replace `var' = `var'temp if missing(`var')
	drop `var'temp
}

* Replace missing categorical non-ordinal variables with the mode and a dummy to indicate that it was previously missing
* (egen mode() returns missing when a year has several modes, so such years stay missing)
foreach var of varlist foreign L_civilStatus L_emplStatu L_Municipality L_Industry_3digit L_Occupation_3D_L1L2 citizenship {
	gen byte `var'_Missing = missing(`var')
	bysort year: egen `var'temp = mode(`var')
	replace `var' = `var'temp if missing(`var')
	drop `var'temp
}

* Replace missing continuous variables with the mean and a dummy to indicate that it was previously missing
foreach var in L_layoffRate_L1L2 L_nEmployees_L1L2 {
	gen byte `var'Missing = missing(`var')
	bysort year: egen `var'temp = mean(`var')
	replace `var' = `var'temp if missing(`var')
	drop `var'temp
}

* Replace missing continuous variables with the mean and a dummy to indicate that
* it was previously missing. Years before ref_year receive the ref_year mean for
* all of these variables (one common year rather than one year per variable).
local ref_year 1995
foreach var of varlist DaysOnDI_1Years-DaysOnDI_5Years ///
						DaysUnemp_1Years-DaysUnemp_5Years ///
						OtherInc_adj_L2_L5 WageInc_adj_L2_L5 FamInc_adj_L2_L5 ///
						L_FamInc_adj L_WageInc_adj L_OtherInc_adj L_firmSizeChange_L1L2 {
	gen byte `var'Missing = missing(`var')
	bysort year: egen `var'temp = mean(`var')
	* Mean of the reference year, copied to every row
	egen `var'temp2 = mean(cond(year == `ref_year', `var'temp, .))
	* From ref_year onwards: mean of the own year
	replace `var' = `var'temp if missing(`var') & year >= `ref_year'
	* Before ref_year: mean of the reference year
	replace `var' = `var'temp2 if missing(`var') & year < `ref_year'
	drop `var'temp `var'temp2
}

* Generate dummies that indicate that a certain variable is zero
foreach var in L_WageInc_adj L_OtherInc_adj L_FamInc_adj {
	gen byte `var'0 = (`var'==0)
	label var `var'0 " 1 if `var' = 0 "
}

foreach var in OtherInc WageInc FamInc {
	gen byte `var'_L2_L5_adj0 = (`var'_adj_L2_L5==0)
	label var `var'_L2_L5_adj0 " 1 if `var'_adj_L2_L5 = 0 "
}

save "${data}/${id_code}_FinalMainDataset.dta", replace

********************************************************************************
* A5: Verify that no observations in years from 1992 have missing values
********************************************************************************
* NOTE(review): the heading says 1992 but the assert below uses year>=1991 -
* confirm which cut-off is intended.
* Runs on the data still in memory from A4 (no file is loaded here). The check is
* wrapped in preserve/restore, so the dropped L_Occupation variables and the
* helper variable any_missing do not remain in the data afterwards.
* missings is a community-contributed command, not part of official Stata, and
* has to be installed; any_missing presumably holds the number of missing values
* per row over the listed variable range - TODO confirm against its help file.

* pause is switched off, so the pause statement below does not stop execution
pause off
preserve
* Variables starting with L_Occupation are excluded from the check
drop L_Occupation*
missings tag any_LT_spells_5Y-FamInc_L2_L5_adj0 , gen(any_missing)
pause
* Stops the do-file with an error if any checked row still has a missing value
assert any_missing ==0 if year>=1991
restore

********************************************************************************
* A6: Creating samples
********************************************************************************
* Adds 0/1 indicators for the estimation samples and writes them back to the
* main dataset file.

use "${data}/${id_code}_FinalMainDataset.dta", clear

* Main sample: age 25 to 55 inclusive (inrange() is 0 when age is missing)
gen inSample_Full = inrange(age, 25, 55)
label variable inSample_Full "Main sample age 25-55"

* Main sample restricted to rows with foreign equal to 0
gen inSample_Full_F = (inSample_Full == 1 & foreign == 0)
label variable inSample_Full_F "Main sample excluding foreigners"

* Wider age band: 18 to 65 inclusive
gen inSample_Full_A = inrange(age, 18, 65)
label variable inSample_Full_A "Sample age 18-65"

* Main sample restricted to rows with training_in6M equal to 0
gen inSample_Full_NT = (inSample_Full == 1 & training_in6M == 0)
label variable inSample_Full_NT "Sample age 25-55, excl. indiv. who enter training in first 6M"

* Same rows as the main sample; only the label differs
gen inSample_Full_LT = inSample_Full
label variable inSample_Full_LT "Main sample age 25-55, plus var indicating LT history"

save "${data}/${id_code}_FinalMainDataset.dta", replace

********************************************************************************
* A7: Check the sample size
********************************************************************************

use "${data}/${id_code}_FinalMainDataset.dta", clear

* Write the number of observations in each sample to a log file.
* The output global is not set in this file and has to be defined by the caller.
log using "${output}/${id_code}_SizeSampleModels.log", replace

local samples Full Full_F Full_A Full_NT Full_LT
foreach model of local samples {
	* Print the sample name first so the count that follows can be identified
	display "`model'"
	count if inSample_`model' == 1
}

log close

********************************************************************************
* A8: Adding expanded versions of the model
********************************************************************************
* Adds wage, wealth, insurance, replacement-ratio, IQ and union variables to the
* main dataset. Each merge keeps all master rows (matched or not) and discards
* rows that exist only in the using file, so the number of rows does not change.
* Paths use forward slashes like the rest of this file; Stata accepts them on
* every platform, whereas backslashes only work on Windows.
* The data_intermediate global is not set in this file and has to be defined by the caller.

use "${data}/${id_code}_FinalMainDataset.dta", clear

* Preserve order, because we use it later to split into samples and we want
* exactly the same order as in the baseline analysis so that the results are
* comparable.
* Stored as long: the default float type cannot represent every integer above
* 16,777,216, which would create ties in ord and break the final re-sort.
gen long ord = _n

	******************************************************************
	* A8.1: Merge with wage data
	******************************************************************
	merge m:1 LopNr_PersonNr year using "${data_intermediate}/001_05_Wages_clean.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.2: Merge with wealth data
	******************************************************************
	merge m:1 LopNr_PersonNr year using "${data_intermediate}/001_04_Wealth_clean.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.3: Merge with additional insurance data
	******************************************************************
	merge m:1 LopNr_PersonNr year using "${data_intermediate}/001_08_UI_final.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.4: Merge with replacement ratio
	******************************************************************
	* This file is keyed by person, year and month; the month is taken from startU
	gen month=month(startU)
	merge m:1 LopNr_PersonNr year month using "${data_intermediate}/001_07_UnempBenefits_clean_alt.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.5: Merge with IQ data
	******************************************************************
	* Keyed by person only (no year)
	merge m:1 LopNr_PersonNr using "${data_intermediate}/001_10_IQData_clean.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.6: Merge with union membership data
	******************************************************************
	merge m:1 LopNr_PersonNr year using "${data_intermediate}/001_8_unionMember.dta", ///
		keep(master match) nogenerate

	******************************************************************
	* A8.7: Save the expanded dataset
	******************************************************************
	compress
	save "${data}/${id_code}_FinalMainDataset_expanded.dta", replace
	
	******************************************************************
	* A8.8: Splitting data into samples
	******************************************************************
	* For every expanded model, inSample_<model> starts as a copy of inSample_Full
	* and is set to 0 in each row where any variable of that model is missing.
	* The baseline variable lists come from the globals demoEdu, migHist, mun,
	* incIndiv, incOther, incHist, indu and emplHist, which are not set in this
	* file and have to be defined by the caller.

	* Additional variables of each expanded model (identical for every model, so
	* they are defined once, outside the loop)
	local wealth "L_NetWealth L_NetWealth0 L_Liabilities L_Liabilities0 L_BankAccount L_BankAccount0 L_RealEstate0 L_RealEstate"
	local UI "L_additionalUI"
	local wage "L1_monthlyWage L1_percenFullTimeNew L1_percenFullTimeMiss"
	local occup "L_Occupation_3D_L1L2 L_Occupation_3D_L1L2_Missing"
	local levelUI "replacRatio_new"
	local IQ "cognit_dummy1 cognit_dummy2 cognit_dummy3 non_cognit_dummy1 non_cognit_dummy2 non_cognit_dummy3"
	local union "L_unionMember"

	* Variables shared by all models
	local baseline $demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist

	* Full variable list per model = baseline + model-specific additions
	local varEX_WE `baseline' `wealth'
	local varEX_UI `baseline' `UI'
	local varEX_WA `baseline' `wage'
	local varEX_OC `baseline' `occup'
	local varEX_RR `baseline' `levelUI'
	local varEX_IQ `baseline' `IQ'
	local varEX_UM `baseline' `union'

	* generate a variable indicating that none of the variables in the model is missing
	foreach model in EX_WA EX_UI EX_WE EX_OC EX_RR EX_IQ EX_UM {

		* Start from Full sample:
		gen inSample_`model' = inSample_Full

		* missing() also excludes rows holding extended missing values (.a-.z)
		foreach var of varlist `var`model'' {
			replace inSample_`model' = 0 if missing(`var')
		}
	}

	save "${data}/${id_code}_FinalMainDataset_expanded.dta", replace

	******************************************************************
	* A8.9: Check the sample size
	******************************************************************
	* Writes the number of observations in each expanded sample to a log file,
	* using the data currently in memory.

	log using "${output}/${id_code}_SizeSampleModels_expanded.log", replace

	foreach model in EX_WA EX_UI EX_WE EX_OC EX_RR EX_IQ EX_UM {

		* Print the model name first, as in A7, so each count can be identified
		di "`model'"
		count if inSample_`model'==1

	}

	log close
	
	********************************************************************************
	* A8.10: Creating a dummy for missing values and replacing missing values
	********************************************************************************
	* Same recipe as in A4, applied to the variables added in A8: store a 0/1 dummy
	* for rows with a missing value, then fill those rows with a within-year
	* statistic. This runs after the inSample flags of A8.8 were created, so the
	* flags refer to the values before imputation.
	* missing() is used instead of a comparison with "." so that extended missing
	* values (.a-.z) are flagged and imputed too. egen mode() and mean() skip
	* missing values and fill every row of the by-group, so one call is enough.

	* Replace missing categorical non-ordinal variables with the mode and a dummy to indicate that it was previously missing
	foreach var of varlist L_additionalUI L_unionMember ///
		cognit_dummy1 cognit_dummy2 cognit_dummy3 ///
		non_cognit_dummy1 non_cognit_dummy2 non_cognit_dummy3 {
		gen byte `var'_Missing = missing(`var')
		bysort year: egen `var'temp = mode(`var')
		replace `var' = `var'temp if missing(`var')
		drop `var'temp
	}

	* We only need one missing dummy for the IQ data:
	egen cognit_dummyMissing = anymatch(cognit_dummy1_Missing cognit_dummy2_Missing ///
		cognit_dummy3_Missing), values(1)

	egen non_cognit_dummyMissing = anymatch(non_cognit_dummy1_Missing non_cognit_dummy2_Missing ///
		non_cognit_dummy3_Missing), values(1)

	drop cognit_dummy1_Missing cognit_dummy2_Missing cognit_dummy3_Missing ///
		non_cognit_dummy1_Missing non_cognit_dummy2_Missing non_cognit_dummy3_Missing

	* Replace missing continuous variables with the mean and a dummy to indicate that it was previously missing
	foreach var in L_NetWealth L_Liabilities L_BankAccount L_RealEstate ///
		replacRatio_new L1_monthlyWage L1_percenFullTimeNew {
		gen byte `var'Missing = missing(`var')
		bysort year: egen `var'temp = mean(`var')
		replace `var' = `var'temp if missing(`var')
		drop `var'temp
	}

	* Generate dummies that indicate that a certain variable is zero.
	* The versions that came with the merged data are dropped and rebuilt from
	* the imputed values.
	drop L_NetWealth0 L_Liabilities0 L_BankAccount0 L_RealEstate0

	foreach var in L_NetWealth L_Liabilities L_BankAccount L_RealEstate {
		gen byte `var'0 = (`var'==0)
		label var `var'0 " 1 if `var' = 0 "
	}

	* Restore order:
	sort ord
	drop ord

	* Save:
	save "${data}/${id_code}_FinalMainDataset_expanded.dta", replace
